Creating new pipeline using seurat v4.0.2 available 2021.06.08

Load libraries required for Seuratv4

knitr::opts_knit$set(root.dir = "~/Desktop/10XGenomicsData/msAggr_scRNASeq/")
library(dplyr)
library(Seurat)
library(patchwork)
library(ggplot2)
# library(clustree)

store session info

# sink("CMP-v1.20210616.txt")
sessionInfo()
# sink()

Set global variables

projectName <- "CMP"
jackstraw.dim <- 40
source("msAggr_AnalysisCode/read_10XGenomics_data.R")
source("msAggr_AnalysisCode/PercentVariance.R")
source("msAggr_AnalysisCode/Mouse2Human_idconversion.R")
setwd("../cellRanger/") # temporarily changing wd only works if you run the entire chunk at once
data_file.list <- read_10XGenomics_data(sample.list = "CMPm2")
object.data <-Read10X(data_file.list)
cmp.object<- CreateSeuratObject(counts = object.data, min.cells = 3, min.genes = 200, project = projectName)

Clean up to free memory

remove(object.data)

Add mitochondrial metadata and plot some basic features

cmp.object[["percent.mt"]] <- PercentageFeatureSet(cmp.object, pattern = "^mt-")
VlnPlot(cmp.object, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, pt.size = 0, fill.by = 'orig.ident', )
plot1 <- FeatureScatter(cmp.object, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by = "orig.ident", pt.size = 0.01)
plot2 <- FeatureScatter(cmp.object, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "orig.ident", pt.size = 0.01)
plot1 + plot2

We don’t have to worry about comparing library depths, so we’ll just do normalization/Scale data

remove low quality cells require: nFeature_RNA between 200 and 4000 (inclusive) require: percent.mt <=5

print(paste("original object:", nrow(cmp.object@meta.data), "cells", sep = " "))
cmp.object <- subset(cmp.object, 
                                                subset = nFeature_RNA >=200 & 
                                                    nFeature_RNA <= 4000 & 
                                                    percent.mt <= 5
                                                )
print(paste("new object:", nrow(cmp.object@meta.data), "cells", sep = " "))
cmp.object <- NormalizeData(cmp.object, normalization.method = "LogNormalize", scale.factor = 10000)

Find variable features

cmp.object <- FindVariableFeatures(cmp.object, selection.method = "vst", nfeatures = 2000)
top10 <- head(VariableFeatures(cmp.object), 10)
plot1 <- VariableFeaturePlot(cmp.object)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2

Scale data (linear transformation)

Save raw object

saveRDS(cmp.object, file = paste0(projectName, "_raw.RDS"))
cmp.object <- RunPCA(cmp.object, features = VariableFeatures(cmp.object), ndims.print = 1:5, nfeatures.print = 5)
DimPlot(cmp.object, reduction = "pca", group.by = "orig.ident")
VizDimLoadings(cmp.object, dims = 1:6, nfeatures = 10, reduction = "pca", ncol = 3)

Calculate dimensionality

ElbowPlot(cmp.object, ndims = 50)
percent.variance(cmp.object@reductions$pca@stdev)

Number of PCs describing X% of variance

tot.var <- percent.variance(cmp.object@reductions$pca@stdev, plot.var = FALSE, return.val = TRUE)
paste0("Num pcs for 80% variance:", length(which(cumsum(tot.var) <= 80)))
paste0("Num pcs for 85% variance:", length(which(cumsum(tot.var) <= 85)))
paste0("Num pcs for 90% variance:", length(which(cumsum(tot.var) <= 90)))
paste0("Num pcs for 95% variance:", length(which(cumsum(tot.var) <= 95)))

Add cluster IDs from Seurat v1

Exported cell IDs for clusters 3, 17, 10, 11 from Seurat v1. Will add these IDs as a metadata column.
Create column “clust.ID” and populate with 0’s. Then import IDs for clusters

clust3.cells <- read.table(file = "Seuratv1_clusterCellIDs/cluster3cellIDs.txt", col.names = "clust03")
clust3.cells <- sapply(clust3.cells, function(x) paste0(gsub("CMP", "CMPm2", x), "-1"))
clust17.cells <- read.table(file = "Seuratv1_clusterCellIDs/cluster17cellIDs.txt", col.names = "clust17")
clust17.cells <- sapply(clust17.cells, function(x) paste0(gsub("CMP", "CMPm2", x), "-1"))
clust10.cells <- read.table(file = "Seuratv1_clusterCellIDs/cluster10cellIDs.txt", col.names = "clust10")
clust10.cells <- sapply(clust10.cells, function(x) paste0(gsub("CMP", "CMPm2", x), "-1"))
clust11.cells <- read.table(file = "Seuratv1_clusterCellIDs/cluster11cellIDs.txt", col.names = "clust11")
clust11.cells <- sapply(clust11.cells, function(x) paste0(gsub("CMP", "CMPm2", x), "-1"))

Add new metadata column

cmp.object@meta.data['clust.ID'] <- 0
head(cmp.object@meta.data)

now map new ids

cmp.object@meta.data$clust.ID[rownames(cmp.object@meta.data) %in% clust3.cells] <- 3
cmp.object@meta.data$clust.ID[rownames(cmp.object@meta.data) %in% clust17.cells] <- 17
cmp.object@meta.data$clust.ID[rownames(cmp.object@meta.data) %in% clust10.cells] <- 10
cmp.object@meta.data$clust.ID[rownames(cmp.object@meta.data) %in% clust11.cells] <- 11

do numbers make sense?

nrow(cmp.object@meta.data[cmp.object@meta.data$clust.ID == 10,])
nrow(cmp.object@meta.data[cmp.object@meta.data$clust.ID == 11,])
nrow(cmp.object@meta.data[cmp.object@meta.data$clust.ID == 17,])
nrow(cmp.object@meta.data[cmp.object@meta.data$clust.ID == 3,])

Color palette

color.palette <- c(
    "coral",
    "chartreuse4",
    "goldenrod1",
    "cadetblue1",
    "burlywood",
    "brown",
    "brown1",
    "blue",
    "blue4",
    "azure3",
    "aquamarine",
    "antiquewhite",
    "cadetblue",
    "gold3",
    "black",
    "darkgreen",
    "deeppink",
    "darkviolet",
    "darkturquoise",
    "darkslategray",
    "darksalmon",
    "darkorchid1",
    "darkolivegreen2",
    "forestgreen",
    "dodgerblue",
    "green",
    "lightpink",
    "lightcoral",
    "khaki1",
    "maroon",
    "peru",
    "lightseagreen",
    "lightsalmon",
    "plum",
    "moccasin",
    "tan",
    "tan1", 
    "red", 
    "purple",
    "khaki4",
    "black", 
    "plum4"
)

Total var 90%

Neighborhood and umap

set total.var <- 90%

color.palette <- c(
    "coral",
    "chartreuse4",
    "goldenrod1",
    "cadetblue1",
    "burlywood",
    "brown",
    "brown1",
    "blue",
    "blue4",
    "azure3",
    "aquamarine",
    "antiquewhite",
    "cadetblue",
    "gold3",
    "black",
    "darkgreen",
    "deeppink",
    "darkviolet",
    "darkturquoise",
    "darkslategray",
    "darksalmon",
    "darkorchid1",
    "darkolivegreen2",
    "forestgreen",
    "dodgerblue",
    "green",
    "lightpink",
    "lightcoral",
    "khaki1",
    "maroon",
    "peru",
    "lightseagreen",
    "lightsalmon",
    "plum",
    "moccasin",
    "tan",
    "tan1", 
    "red", 
    "purple",
    "khaki4",
    "black", 
    "plum4"
)

Plot UMAP

cmp.object <- FindNeighbors(cmp.object, dims = 1:ndims)
Computing nearest neighbor graph
Computing SNN
cmp.object <- FindNeighbors(cmp.object, dims = 1:ndims)
Computing nearest neighbor graph
Computing SNN
cmp.object <- FindClusters(cmp.object, resolution = 0.5)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 12059
Number of edges: 452999

Running Louvain algorithm...
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8502
Number of communities: 9
Elapsed time: 1 seconds
cmp.object <- RunUMAP(cmp.object, dims = 1: ndims)
Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session
15:09:54 UMAP embedding parameters a = 0.9922 b = 1.112
15:09:54 Read 12059 rows and found 33 numeric columns
15:09:54 Using Annoy for neighbor search, n_neighbors = 30
15:09:54 Building Annoy index with metric = cosine, n_trees = 50
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:09:55 Writing NN index file to temp file /var/folders/4f/fwrj6fnn1dn4g8wsf0zv563hjsvl24/T//RtmpYaUTrq/file158c04b44b74a
15:09:55 Searching Annoy index using 1 thread, search_k = 3000
15:09:58 Annoy recall = 100%
15:09:59 Commencing smooth kNN distance calibration using 1 thread
15:09:59 Initializing from normalized Laplacian + noise
15:10:00 Commencing optimization for 200 epochs, with 513294 positive edges
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
15:10:06 Optimization finished
for(x in c(0.5, 1, 1.5, 2, 2.5)){
    cmp.object <- FindClusters(cmp.object, resolution = x)
}
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 12059
Number of edges: 452999

Running Louvain algorithm...
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8502
Number of communities: 9
Elapsed time: 1 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 12059
Number of edges: 452999

Running Louvain algorithm...
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.7974
Number of communities: 17
Elapsed time: 1 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 12059
Number of edges: 452999

Running Louvain algorithm...
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.7652
Number of communities: 22
Elapsed time: 1 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 12059
Number of edges: 452999

Running Louvain algorithm...
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.7404
Number of communities: 27
Elapsed time: 1 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 12059
Number of edges: 452999

Running Louvain algorithm...
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.7189
Number of communities: 30
Elapsed time: 1 seconds

for each resolution, number/percentage of cells in each cluster?

for (meta.col in colnames(cmp.object@meta.data)){
    if(grepl(pattern = ("RNA_snn_res"), x = meta.col)==TRUE){
        myplot <- DimPlot(cmp.object, 
                                            group.by = meta.col,
                                            reduction = "umap", 
                                            cols = color.palette
                                            ) + 
            ggtitle(paste0(projectName, " dim", ndims, "res", gsub("RNA_snn_res", "", meta.col) ))
        plot(myplot)
    }
}

For each resolution, what percentage of cells in each cluster are enriched for one of our clust.IDs?

Test: what percentage of each new clusterID matches one of the older clusters?

tot.cells <- nrow(cmp.object@meta.data)
for (meta.col in colnames(cmp.object@meta.data)){
    if(grepl(pattern = ("RNA_snn_res"), x = meta.col)==TRUE){
        new.clusters <- sort(as.numeric(levels(cmp.object@meta.data[[meta.col]])))
        stats.df <- data.frame(matrix(ncol = 2, nrow = length(new.clusters)))
        colnames(stats.df) <- c("num_cells", "pct_pop")
        rownames(stats.df) <- new.clusters
        meta.df <- cmp.object@meta.data
        for(row.id in rownames(stats.df)){
                num.x <- nrow(meta.df[meta.df[meta.col] == row.id,])
                pct.x <- as.integer(num.x / tot.cells *100)
                # print(pct.x)
                stats.df[row.id, "num_cells"] <- num.x
                stats.df[row.id, "pct_pop"] <- pct.x
        }
        print(stats.df)
    }
}

Absolutely terrible overlap, no enrichment of any of these across the new clustering algorithm. Maybe should try 95% variation covered

Find old cells on UMAP

time for the super scarey moment to see if the cells from seuratv1 still cluster together on in seurat v4

DimPlot(cmp.object,
                reduction = "umap",
                group.by = "clust.ID", 
                # split.by = "orig.ident",
                cols = c("gray", "orange", "blue", "red", "green"),)
DimPlot(cmp.object,
                reduction = "umap",
                group.by = "orig.ident", 
                split.by = "clust.ID",
                cols = c("gray", "orange", "blue", "red", "green"),)

Total var 95%

Neighborhood and umap

set total.var <- 95%

tot.var <- percent.variance(cmp.object@reductions$pca@stdev, plot.var = FALSE, return.val = TRUE)
ndims <- length(which(cumsum(tot.var) <= 95))

cmp.object <- FindNeighbors(cmp.object, dims = 1:ndims)
cmp.object <- FindClusters(cmp.object, resolution = 0.5)
cmp.object <- RunUMAP(cmp.object, dims = 1: ndims)

saveRDS(cmp.object, file = paste0(projectName, "_dim", ndims, ".RDS"))

Plot UMAP

for(x in c(0.5, 1, 1.5, 2, 2.5)){
    cmp.object <- FindClusters(cmp.object, resolution = x)
}

For each resolution, what percentage of cells in each cluster are enriched for one of our clust.IDs?

Test: what percentage of each new clusterID matches one of the older clusters?

for (meta.col in colnames(cmp.object@meta.data)){
    if(grepl(pattern = ("RNA_snn_res"), x = meta.col)==TRUE){
        new.clusters <- sort(as.numeric(levels(cmp.object@meta.data[[meta.col]])))
        enrich.df <- data.frame(matrix(ncol = 4, nrow = length(new.clusters)))
        colnames(enrich.df) <- c(3, 17, 10, 11)
        rownames(enrich.df) <- new.clusters
        meta.df <- cmp.object@meta.data
        for(row.id in rownames(enrich.df)){
            tot.clus <- nrow(meta.df[meta.df[[meta.col]] == row.id,])
            for(col.id in colnames(enrich.df)){
                num.x <- nrow(meta.df[(meta.df[[meta.col]] == row.id) & (meta.df$clust.ID == col.id),])
                pct.x <- as.integer(num.x / tot.clus *100)
                # print(pct.x)
                enrich.df[row.id, col.id] <- pct.x
            }
        }
        colnames(enrich.df) <- sapply(colnames(enrich.df), function(x) paste0("oldcluster", x))
        rownames(enrich.df) <- sapply(rownames(enrich.df), function(x) paste0("newcluster", x))
        xlsx::write.xlsx(enrich.df, file = paste0("PctOfNewClustersOverlappingOldClusters_", projectName, "_dim", ndims, ".xlsx"), sheetName = paste0(gsub("RNA_snn_", "", meta.col)), append = TRUE)
        print(enrich.df)
    }
}

Absolutely terrible overlap, no enrichment of any of these across the new clustering algorithm. Maybe should try 95% variation covered

Find old cells on UMAP

time for the super scarey moment to see if the cells from seuratv1 still cluster together on in seurat v4

DimPlot(cmp.object,
                reduction = "umap",
                group.by = "clust.ID", 
                pt.size = .1,
                # split.by = "orig.ident",
                cols = c("gray", "orange", "blue", "red", "green"),)
DimPlot(cmp.object,
                reduction = "umap",
                group.by = "orig.ident", 
                split.by = "clust.ID",
                cols = c("gray", "orange", "blue", "red", "green"),)

Gene expression of old clustrs on new map

Let’s see if we can get some gene expression profiles on these…

gene.list <- c("Gata1", "Gata2", "Pf4", "Dntt", "Mpo", "Meis1", "Irf8", "Elane", "Fli1", "Zfpm1")
VlnPlot(cmp.object, features = gene.list, group.by = "clust.ID", pt.size = 0.01, cols = c("gray", "orange", "blue", "red", "green"))

Used the exce doc to do some fancy conditional formatting. Old cluster 17 is pretty dispersed until you it resolution 2.5. Otherise, cells in old cluster 17 do not constitute more than 40% of any cells in the new clusters.
As far as I can see, the two approaches are to do DGEof new CMP w/ resolution = 2.5, AND/OR do DGe using older cluster IDs. Sure seems to make sense to do both…

DGE w/ resolution = 2.5

Strt with comparing all clusters against all other clusters Write out cluster info

calculate FindAllMarkers() for different idents and save to new file

ident.list <- c("RNA_snn_res.0.5", "RNA_snn_res.1", "RNA_snn_res.1.5", "RNA_snn_res.2", "RNA_snn_res.2.5", "clust.ID")
for(tested.ident in ident.list){
    Idents(cmp.object) <- tested.ident
    all.markers <- FindAllMarkers(cmp.object)
    xlsx::write.xlsx(x = all.markers[,c("avg_log2FC", "p_val_adj", "cluster", "gene")], 
                                     file = paste0(projectName, "_FindALLMarkers_res2.5.xlsx"), 
                                     sheetName = tested.ident, 
                                     col.names = TRUE, 
                                     row.names = FALSE, 
                                     append = TRUE)
}

Create FindAllMarkers() lists for GSEA

Idents(cmp.object) <- "RNA_snn_res.2.5"
res.2.5.allmarkers <- FindAllMarkers(cmp.object)

Map HGNC symbols

Mouse2HumanTable <- Mouse2Human(res.2.5.allmarkers$gene)

HGNC <- with(Mouse2HumanTable, Mouse2HumanTable$HGNC[match(res.2.5.allmarkers$gene, Mouse2HumanTable$MGI)])
head(res.2.5.allmarkers)
res.2.5.allmarkers$HGNC <- HGNC
tail(res.2.5.allmarkers)
sig.res.2.5 <- res.2.5.allmarkers[res.2.5.allmarkers$p_val_adj <= 0.05, ]
sig.res.2.5 <- sig.res.2.5[c("avg_log2FC", "HGNC", "cluster")]
sig.res.2.5 <- sig.res.2.5[!(sig.res.2.5$HGNC == "" | is.na(sig.res.2.5$HGNC)),] # GSEA will fail if there are any blanks or NAs in the table
sig.res.2.5 <- sig.res.2.5[]
for(cluster in unique(sig.res.2.5$cluster)){
    print(paste("writing cluster", cluster))
    new.table <- sig.res.2.5[sig.res.2.5$cluster == cluster, c("HGNC", "avg_log2FC")]
    new.table <- new.table[order(-new.table$avg_log2FC), ]
    write.table(new.table, file = paste0("RankList_res2.5_findAll_hgnc/res.2.5cluster", cluster, ".rnk"), quote = FALSE, row.names = FALSE, col.names = TRUE, sep = "\t", )
    
}

calculate FindMarkers() that distinguish each cluster (might overlab between clusters)

ident.list <- c("RNA_snn_res.0.5", "RNA_snn_res.1", "RNA_snn_res.1.5", "RNA_snn_res.2", "RNA_snn_res.2.5", "clust.ID")
for(tested.ident in ident.list){
    for (cluster in sort(as.numeric(levels(cmp.object@meta.data[[tested.ident]])))){
    cluster.markers <- FindMarkers(cmp.object, ident.1 = cluster)
    xlsx::write.xlsx(x = cluster.markers[,c("avg_log2FC", "p_val_adj")], 
                                     file = paste0(projectName, "_FindMarkers_", gsub("RNA_snn_", "", tested.ident), ".xlsx"), 
                                     sheetName = paste0("clst", cluster), 
                                     col.names = TRUE, 
                                     row.names = TRUE, 
                                     append = TRUE)
}
}
for (cluster in sort(as.numeric(levels(cmp.object@meta.data$RNA_snn_res.2.5)))){
    cluster.markers <- FindMarkers(cmp.object, ident.1 = cluster)
    xlsx::write.xlsx(x = cluster.markers[,c("avg_log2FC", "p_val_adj")], 
                                     file = paste0(projectName, "_FindMarkers_res2.5.xlsx"), 
                                     sheetName = paste0("clst", cluster), 
                                     col.names = TRUE, 
                                     row.names = TRUE, 
                                     append = TRUE)
}

Combine clusters that might represent old cluster ids

DGE w/ metadata against clust.ID against “0”

reset ident as “clust.ID” and rerun FindAllMarkers()

    Idents(cmp.object) <- "clust.ID"
    all.markers <- FindAllMarkers(cmp.object)
    xlsx::write.xlsx(x = all.markers[,c("avg_log2FC", "p_val_adj", "cluster", "gene")], 
                                     file = paste0(projectName, "_FindALLMarkers_clustID.xlsx"), 
                                     sheetName = "clustID", 
                                     col.names = TRUE, 
                                     row.names = FALSE, 
                                     append = TRUE)
# Idents(cmp.object) <- "clust.ID"
for (cluster in unique(cmp.object@meta.data$clust.ID)){
    print(cluster)
    cluster.markers <- FindMarkers(cmp.object, ident.1 = cluster)
    xlsx::write.xlsx(x = cluster.markers[,c("avg_log2FC", "p_val_adj")], 
                                     file = paste0(projectName, "_FindMarkers_clustID.xlsx"), 
                                     sheetName = paste0("oldclust", cluster), 
                                     col.names = TRUE, 
                                     row.names = TRUE, 
                                     append = TRUE)
}

Distinguishing features of clusters

Previously defined biomark genes based on PC contributions. Original list was based on all msAggr, but let’s see how CMP subset does?

VizDimLoadings(cmp.object, dims = 1:10, nfeatures = 30, reduction = "pca", ncol = 2)
pca.df <- cmp.object[["pca"]]
pca.df <- as.data.frame(as.matrix(slot(object = pca.df, name = "feature.loadings")))
print(cmp.object[["pca"]], dims = 2, nfeatures = 5)
rownames(pca.df[pca.df$PC_2 %in% sort(pca.df$PC_2, decreasing = TRUE)[1:5], ])
rownames(pca.df[pca.df$PC_2 %in% sort(pca.df$PC_2)[1:5], ])

now we can get a list of principal components!
first pull the list of oldAnalysis CMP top PC genes

cmp.biomark <- read.table(file = "/Users/heustonef/Desktop/CMPSubpops/BioMark/ProbePanels/CMP_PCTopGenes.txt", sep = "\t", header = TRUE)
biomark.cmptargets <- c()
for(df.col in 1:ncol(cmp.biomark)){
    biomark.cmptargets <- c(biomark.cmptargets, biomark[,df.col])
}
print(colnames(biomark))
print(paste("total gene count:", length(biomark.cmptargets)))

Now get the list of current pc gene trgets (oldAnalysis used ndim = 1:6, so we’ll start with that range)

pc.list <- c("PC_1", "PC_2", "PC_3", "PC_4", "PC_5", "PC_6")
pc.genes <- lapply(pc.list, function(x) rownames(pca.df[pca.df[[x]] %in% sort(pca.df[[x]], decreasing = TRUE)[1:30],])) #targeting roughly 180 genes like in biomark.cmptargets
pc.genes <- unique(unlist(pc.genes))
print(paste("total gene count:", length(pc.genes)))

Now compare the lists, I guess:

# setdiff(x,y) gives you things in x not in y. setdiff(y,x) gives you things in y not in x
setdiff(biomark.cmptargets, pc.genes)
# print(paste("\n length:", length(setdiff(biomark.cmptargets, pc.genes))))
writeLines(c("", "length:", length(setdiff(biomark.cmptargets, pc.genes))))

Umm, yeah that went kinda how I expected. Let’s do this again, but for the actual biomark gene lists.

biomark <- read.table(file = "/Users/heustonef/Desktop/CMPSubpops/BioMark/ProbePanels/BiomarkProbeList.txt", sep = "\t")
biomark <- biomark[,1]
setdiff(biomark, pc.genes)
writeLines(c("", "length:", length(setdiff(biomark, pc.genes))))

What if we increase the number of pcs but decrease the depth of each? This might cover more of biomark, which was originally developed using msAggr instead of only the CMP subset

pc.list <- c("PC_1", "PC_2", "PC_3", "PC_4", "PC_5", "PC_6", "PC_7", "PC_8", "PC_9", "PC_10")
pc.genes <- lapply(pc.list, function(x) rownames(pca.df[pca.df[[x]] %in% sort(pca.df[[x]], decreasing = TRUE)[1:20],]))
pc.genes <- unique(unlist(pc.genes))
print(paste("total gene count:", length(pc.genes)))
setdiff(biomark, pc.genes)
writeLines(c("", "length:", length(setdiff(biomark, pc.genes))))

For comparison, let’s just see how many of biomark.cmptargets were actually included in biomark

setdiff(biomark.cmptargets, biomark)
writeLines(c("", "length:", length(setdiff(biomark.cmptargets, pc.genes))))
length(biomark) - length(setdiff(biomark, biomark.cmptargets))
length(biomark) - length(setdiff(biomark, pc.genes))

So when you look at it like that, it’s not actually that far off.

What are the similarities?:

setdiff(setdiff(biomark, biomark.cmptargets), setdiff(biomark, pc.genes))

These are genes from the 97probes not in the old CMP set that are also not in the new CMP set. Other than Itga2b (which is a failed probe anyway), nothing screams. Also we’d have thrown Flt3 and Cd34 for in anyway because they’re requisite cell surface markers (also Flt3 surface marker is expensive but otherwise not noteworthy and not used in the current sorting strategy)

What about cell surface marker expression? * Cd34 * Cd16/32 * Cd9 * Cd41 * Cd48 * Sca1 (just throw that in for sh*&s and giggles)

surface.markers <- c("Cd34", "Fcgr3", "Fcgr2b", "Cd9", "Itga2b", "Cd48", "Ly6a")
FeaturePlot(cmp.object, features = surface.markers, pt.size = 1, split.by = "clust.ID", ncol = 1)

Save as png

png(filename = "FeaturePlot_CMP_surfaceMarkers_clustIDfacet.png", height = 1600, width = 1600)
FeaturePlot(cmp.object, features = surface.markers, pt.size = 1, split.by = "clust.ID", ncol = 1)
dev.off()

Cell cycle analysis

Just so we know what we’re working with

s.genes <- cc.genes.updated.2019$s.genes
g2m.genes <- cc.genes.updated.2019$g2m.genes

Compare @ hierarchcial clusteirng

Do clustering using biomark RNAs as input

# Read in BiomarkRNAs
biomark.rnas <- read.table('/Users/heustonef/Desktop/10XGenomicsData/BiomarkRNAs.txt')
biomark.rnas <- biomark.rnas$V1

use biomark RNAs to define dimensional reduction

cmp.object <- readRDS("CMP_raw.RDS")
cmp.object <- RunPCA(cmp.object, features = biomark.rnas, ndims.print = 1:5, , nfeatures.print = 5)
ElbowPlot(cmp.object, ndims = 50)

Now run the clustering

tot.var <- percent.variance(cmp.object@reductions$pca@stdev, plot.var = FALSE, return.val = TRUE)
ndims <- length(which(cumsum(tot.var) <= 90))

cmp.object <- FindNeighbors(cmp.object, dims = 1:ndims)
cmp.object <- FindClusters(cmp.object, resolution = 0.5)
cmp.object <- RunUMAP(cmp.object, dims = 1: ndims)

find the clusters

for(x in c(0.5, 1, 1.5, 2, 2.5)){
    cmp.object <- FindClusters(cmp.object, resolution = x)
}

Plot the umaps and cell cluster ids

for (meta.col in colnames(cmp.object@meta.data)){
    if(grepl(pattern = ("RNA_snn_res"), x = meta.col)==TRUE){
        myplot <- DimPlot(cmp.object, 
                                            group.by = meta.col,
                                            reduction = "umap", 
                                            cols = color.palette
                                            ) + 
            ggtitle(paste0(projectName, " dim", ndims, "res", gsub("RNA_snn_res", "", meta.col) ))
        plot(myplot)
    }
}

Calculate anticipated number of cells you’ll find in each biomark cluster

Get # cells in each cluster

tot.cellcount <- nrow(cmp.object@meta.data)
res05.list <- sort(unique(cmp.object@meta.data$RNA_snn_res.0.5), decreasing = FALSE)
sapply(res05.list, 
             function(x){
                print(
                    paste(
                        "cluster", x, "=", 
                        nrow(cmp.object@meta.data[cmp.object@meta.data$RNA_snn_res.0.5 == x,]), 
                        "cells or", 
                        round(nrow(cmp.object@meta.data[cmp.object@meta.data$RNA_snn_res.0.5 == x,])/tot.cellcount*100, digits = 2), 
                        "% of total"
                    )
                )
             }
            )

So we did the dimensional reduction based on the biomark RNAs, then did our UMAP nearest neighbor clustering.

In the biomark hierarchcial clustering analysis I assayed 167 cells. The smallest cluster I detected had 3 cells, or 1.8% of total, and this is an uncomfortably small number of cells. Based on the UMAP calculations I would therefore expect to find 11 or 12 of the predicted 15 clusters. I found 12, and I don’t really like that last one, so 11 or 12. Since I did the hierarchcial clustering yesterday and did this math today, we can say it was independent of these results and therefore totally legit. Yay!!

---
title: "CMPSubset"
output: html_notebook
---

Creating new pipeline using seurat v4.0.2 available 2021.06.08

Load libraries required for Seuratv4

```{r setup}
knitr::opts_knit$set(root.dir = "~/Desktop/10XGenomicsData/msAggr_scRNASeq/")
library(dplyr)
library(Seurat)
library(patchwork)
library(ggplot2)
# library(clustree)
```
store session info
```{r }
# sink("CMP-v1.20210616.txt")
sessionInfo()
# sink()
```

# Set global variables
```{r}
projectName <- "CMP"
jackstraw.dim <- 40
```



```{r}
source("msAggr_AnalysisCode/read_10XGenomics_data.R")
source("msAggr_AnalysisCode/PercentVariance.R")
source("msAggr_AnalysisCode/Mouse2Human_idconversion.R")
```


```{r}
setwd("../cellRanger/") # temporarily changing wd only works if you run the entire chunk at once
data_file.list <- read_10XGenomics_data(sample.list = "CMPm2")
object.data <-Read10X(data_file.list)
```



```{r}
cmp.object<- CreateSeuratObject(counts = object.data, min.cells = 3, min.genes = 200, project = projectName)
```

Clean up to free memory

```{r}
remove(object.data)
```


Add mitochondrial metadata and plot some basic features
```{r}
cmp.object[["percent.mt"]] <- PercentageFeatureSet(cmp.object, pattern = "^mt-")
VlnPlot(cmp.object, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, pt.size = 0, fill.by = 'orig.ident', )
```


```{r}
plot1 <- FeatureScatter(cmp.object, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by = "orig.ident", pt.size = 0.01)
plot2 <- FeatureScatter(cmp.object, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "orig.ident", pt.size = 0.01)
plot1 + plot2
```

We don't have to worry about comparing library depths, so we'll just do normalization/Scale data



remove low quality cells
require: nFeature_RNA between 200 and 4000 (inclusive)
require: percent.mt <=5

```{r}
print(paste("original object:", nrow(cmp.object@meta.data), "cells", sep = " "))
cmp.object <- subset(cmp.object, 
												subset = nFeature_RNA >=200 & 
													nFeature_RNA <= 4000 & 
													percent.mt <= 5
												)
print(paste("new object:", nrow(cmp.object@meta.data), "cells", sep = " "))
```



```{r}
cmp.object <- NormalizeData(cmp.object, normalization.method = "LogNormalize", scale.factor = 10000)
```


Find variable features
```{r fig.width = 5, fig.height = 2}
cmp.object <- FindVariableFeatures(cmp.object, selection.method = "vst", nfeatures = 2000)
top10 <- head(VariableFeatures(cmp.object), 10)
plot1 <- VariableFeaturePlot(cmp.object)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2

```

Scale data (linear transformation)

```{r echo = FALSE}
all.genes <- rownames(cmp.object)
cmp.object <- ScaleData(cmp.object, features = all.genes, vars.to.regress = c("nFeature_RNA", "nCount_RNA"))
```

## Save raw object
```{r}
saveRDS(cmp.object, file = paste0(projectName, "_raw.RDS"))
```



```{r}
cmp.object <- RunPCA(cmp.object, features = VariableFeatures(cmp.object), ndims.print = 1:5, nfeatures.print = 5)
```

```{r}
DimPlot(cmp.object, reduction = "pca", group.by = "orig.ident")
VizDimLoadings(cmp.object, dims = 1:6, nfeatures = 10, reduction = "pca", ncol = 3)

```

Calculate dimensionality
```{r, figures-side, fig.show='hold', out.width="50%"}
ElbowPlot(cmp.object, ndims = 50)
percent.variance(cmp.object@reductions$pca@stdev)
```
Number of PCs describing X% of variance

```{r}
tot.var <- percent.variance(cmp.object@reductions$pca@stdev, plot.var = FALSE, return.val = TRUE)
paste0("Num pcs for 80% variance:", length(which(cumsum(tot.var) <= 80)))
paste0("Num pcs for 85% variance:", length(which(cumsum(tot.var) <= 85)))
paste0("Num pcs for 90% variance:", length(which(cumsum(tot.var) <= 90)))
paste0("Num pcs for 95% variance:", length(which(cumsum(tot.var) <= 95)))

```

# Add cluster IDs from Seurat v1

Exported cell IDs for clusters 3, 17, 10, 11 from Seurat v1. Will add these IDs as a metadata column.  
Create column "clust.ID" and populate with 0's. Then import IDs for clusters



```{r}
clust3.cells <- read.table(file = "Seuratv1_clusterCellIDs/cluster3cellIDs.txt", col.names = "clust03")
clust3.cells <- sapply(clust3.cells, function(x) paste0(gsub("CMP", "CMPm2", x), "-1"))
clust17.cells <- read.table(file = "Seuratv1_clusterCellIDs/cluster17cellIDs.txt", col.names = "clust17")
clust17.cells <- sapply(clust17.cells, function(x) paste0(gsub("CMP", "CMPm2", x), "-1"))
clust10.cells <- read.table(file = "Seuratv1_clusterCellIDs/cluster10cellIDs.txt", col.names = "clust10")
clust10.cells <- sapply(clust10.cells, function(x) paste0(gsub("CMP", "CMPm2", x), "-1"))
clust11.cells <- read.table(file = "Seuratv1_clusterCellIDs/cluster11cellIDs.txt", col.names = "clust11")
clust11.cells <- sapply(clust11.cells, function(x) paste0(gsub("CMP", "CMPm2", x), "-1"))
```

Add new metadata column
```{r}
cmp.object@meta.data['clust.ID'] <- 0
head(cmp.object@meta.data)
```

now map new ids
```{r}
cmp.object@meta.data$clust.ID[rownames(cmp.object@meta.data) %in% clust3.cells] <- 3
cmp.object@meta.data$clust.ID[rownames(cmp.object@meta.data) %in% clust17.cells] <- 17
cmp.object@meta.data$clust.ID[rownames(cmp.object@meta.data) %in% clust10.cells] <- 10
cmp.object@meta.data$clust.ID[rownames(cmp.object@meta.data) %in% clust11.cells] <- 11
```

do numbers make sense?
```{r}
nrow(cmp.object@meta.data[cmp.object@meta.data$clust.ID == 10,])
nrow(cmp.object@meta.data[cmp.object@meta.data$clust.ID == 11,])
nrow(cmp.object@meta.data[cmp.object@meta.data$clust.ID == 17,])
nrow(cmp.object@meta.data[cmp.object@meta.data$clust.ID == 3,])
```

### Color palette
```{r}
color.palette <- c(
	"coral",
	"chartreuse4",
	"goldenrod1",
	"cadetblue1",
	"burlywood",
	"brown",
	"brown1",
	"blue",
	"blue4",
	"azure3",
	"aquamarine",
	"antiquewhite",
	"cadetblue",
	"gold3",
	"black",
	"darkgreen",
	"deeppink",
	"darkviolet",
	"darkturquoise",
	"darkslategray",
	"darksalmon",
	"darkorchid1",
	"darkolivegreen2",
	"forestgreen",
	"dodgerblue",
	"green",
	"lightpink",
	"lightcoral",
	"khaki1",
	"maroon",
	"peru",
	"lightseagreen",
	"lightsalmon",
	"plum",
	"moccasin",
	"tan",
	"tan1", 
	"red", 
	"purple",
	"khaki4",
	"black", 
	"plum4"
)
```

# Total var 90%
## Neighborhood and umap
set total.var <- 90%
```{r}
tot.var <- percent.variance(cmp.object@reductions$pca@stdev, plot.var = FALSE, return.val = TRUE)
ndims <- length(which(cumsum(tot.var) <= 90))

cmp.object <- FindNeighbors(cmp.object, dims = 1:ndims)
cmp.object <- FindClusters(cmp.object, resolution = 0.5)
cmp.object <- RunUMAP(cmp.object, dims = 1: ndims)

saveRDS(cmp.object, file = paste0(projectName, "_dim", ndims, ".RDS"))
```
Plot UMAP

```{r}
for(x in c(0.5, 1, 1.5, 2, 2.5)){
	cmp.object <- FindClusters(cmp.object, resolution = x)
}
```

```{r}
for (meta.col in colnames(cmp.object@meta.data)){
	if(grepl(pattern = ("RNA_snn_res"), x = meta.col)==TRUE){
		myplot <- DimPlot(cmp.object, 
											group.by = meta.col,
											reduction = "umap", 
											cols = color.palette
											) + 
			ggtitle(paste0(projectName, " dim", ndims, "res", gsub("RNA_snn_res", "", meta.col) ))
		plot(myplot)
	}
}
```


for each resolution, number/percentage of cells in each cluster?

```{r}
tot.cells <- nrow(cmp.object@meta.data)
for (meta.col in colnames(cmp.object@meta.data)){
	if(grepl(pattern = ("RNA_snn_res"), x = meta.col)==TRUE){
		new.clusters <- sort(as.numeric(levels(cmp.object@meta.data[[meta.col]])))
		stats.df <- data.frame(matrix(ncol = 2, nrow = length(new.clusters)))
		colnames(stats.df) <- c("num_cells", "pct_pop")
		rownames(stats.df) <- new.clusters
		meta.df <- cmp.object@meta.data
		for(row.id in rownames(stats.df)){
				num.x <- nrow(meta.df[meta.df[meta.col] == row.id,])
				pct.x <- as.integer(num.x / tot.cells *100)
				# print(pct.x)
				stats.df[row.id, "num_cells"] <- num.x
				stats.df[row.id, "pct_pop"] <- pct.x
		}
		print(stats.df)
	}
}
```



For each resolution, what percentage of cells in each cluster are enriched for one of our clust.IDs?


Test: what percentage of each new clusterID matches one of the older clusters?
```{r}
for (meta.col in colnames(cmp.object@meta.data)){
	if(grepl(pattern = ("RNA_snn_res"), x = meta.col)==TRUE){
		new.clusters <- sort(as.numeric(levels(cmp.object@meta.data[[meta.col]])))
		enrich.df <- data.frame(matrix(ncol = 4, nrow = length(new.clusters)))
		colnames(enrich.df) <- c(3, 17, 10, 11)
		rownames(enrich.df) <- new.clusters
		meta.df <- cmp.object@meta.data
		for(row.id in rownames(enrich.df)){
			tot.clus <- nrow(meta.df[meta.df[[meta.col]] == row.id,])
			for(col.id in colnames(enrich.df)){
				num.x <- nrow(meta.df[(meta.df[[meta.col]] == row.id) & (meta.df$clust.ID == col.id),])
				pct.x <- as.integer(num.x / tot.clus *100)
				# print(pct.x)
				enrich.df[row.id, col.id] <- pct.x
			}
		}
		colnames(enrich.df) <- sapply(colnames(enrich.df), function(x) paste0("oldcluster", x))
		rownames(enrich.df) <- sapply(rownames(enrich.df), function(x) paste0("newcluster", x))
		xlsx::write.xlsx(enrich.df, file = paste0("PctOfNewClustersOverlappingOldClusters_", projectName, "_dim", ndims, ".xlsx"), sheetName = paste0(gsub("RNA_snn_", "", meta.col)), append = TRUE)
		print(enrich.df)
	}
}

```
Absolutely terrible overlap, no enrichment of any of these across the new clustering algorithm. Maybe should try 95% variation covered

## Find old cells on UMAP

time for the super scarey moment to see if the cells from seuratv1 still cluster together on in seurat v4

```{r fig.width = 4}
DimPlot(cmp.object,
				reduction = "umap",
				group.by = "clust.ID", 
				# split.by = "orig.ident",
				cols = c("gray", "orange", "blue", "red", "green"),)
```
```{r fig.width = 4}
DimPlot(cmp.object,
				reduction = "umap",
				group.by = "orig.ident", 
				split.by = "clust.ID",
				cols = c("gray", "orange", "blue", "red", "green"),)
```


# Total var 95%
## Neighborhood and umap
set total.var <- 95%
```{r}
tot.var <- percent.variance(cmp.object@reductions$pca@stdev, plot.var = FALSE, return.val = TRUE)
ndims <- length(which(cumsum(tot.var) <= 95))

cmp.object <- FindNeighbors(cmp.object, dims = 1:ndims)
cmp.object <- FindClusters(cmp.object, resolution = 0.5)
cmp.object <- RunUMAP(cmp.object, dims = 1: ndims)

saveRDS(cmp.object, file = paste0(projectName, "_dim", ndims, ".RDS"))
```
Plot UMAP

```{r}
for(x in c(0.5, 1, 1.5, 2, 2.5)){
	cmp.object <- FindClusters(cmp.object, resolution = x)
}
```



For each resolution, what percentage of cells in each cluster are enriched for one of our clust.IDs?


Test: what percentage of each new clusterID matches one of the older clusters?
```{r}
for (meta.col in colnames(cmp.object@meta.data)){
	if(grepl(pattern = ("RNA_snn_res"), x = meta.col)==TRUE){
		new.clusters <- sort(as.numeric(levels(cmp.object@meta.data[[meta.col]])))
		enrich.df <- data.frame(matrix(ncol = 4, nrow = length(new.clusters)))
		colnames(enrich.df) <- c(3, 17, 10, 11)
		rownames(enrich.df) <- new.clusters
		meta.df <- cmp.object@meta.data
		for(row.id in rownames(enrich.df)){
			tot.clus <- nrow(meta.df[meta.df[[meta.col]] == row.id,])
			for(col.id in colnames(enrich.df)){
				num.x <- nrow(meta.df[(meta.df[[meta.col]] == row.id) & (meta.df$clust.ID == col.id),])
				pct.x <- as.integer(num.x / tot.clus *100)
				# print(pct.x)
				enrich.df[row.id, col.id] <- pct.x
			}
		}
		colnames(enrich.df) <- sapply(colnames(enrich.df), function(x) paste0("oldcluster", x))
		rownames(enrich.df) <- sapply(rownames(enrich.df), function(x) paste0("newcluster", x))
		xlsx::write.xlsx(enrich.df, file = paste0("PctOfNewClustersOverlappingOldClusters_", projectName, "_dim", ndims, ".xlsx"), sheetName = paste0(gsub("RNA_snn_", "", meta.col)), append = TRUE)
		print(enrich.df)
	}
}

```
Absolutely terrible overlap, no enrichment of any of these across the new clustering algorithm. Maybe should try 95% variation covered

## Find old cells on UMAP

time for the super scarey moment to see if the cells from seuratv1 still cluster together on in seurat v4

```{r fig.width = 2}
DimPlot(cmp.object,
				reduction = "umap",
				group.by = "clust.ID", 
				pt.size = .1,
				# split.by = "orig.ident",
				cols = c("gray", "orange", "blue", "red", "green"),)
```
```{r fig.width = 4}
DimPlot(cmp.object,
				reduction = "umap",
				group.by = "orig.ident", 
				split.by = "clust.ID",
				cols = c("gray", "orange", "blue", "red", "green"),)
```



### Gene expression of old clustrs on new map
Let's see if we can get some gene expression profiles on these...
```{r, fig.height=10, fig.width=18}
gene.list <- c("Gata1", "Gata2", "Pf4", "Dntt", "Mpo", "Meis1", "Irf8", "Elane", "Fli1", "Zfpm1")
VlnPlot(cmp.object, features = gene.list, group.by = "clust.ID", pt.size = 0.01, cols = c("gray", "orange", "blue", "red", "green"))
```


Used the exce doc to do some fancy conditional formatting. Old cluster 17 is pretty dispersed until you it resolution 2.5. Otherise, cells in old cluster 17 do not constitute more than 40% of any cells in the new clusters.  
As far as I can see, the two approaches are to do DGEof new CMP w/ resolution = 2.5, AND/OR do DGe using older cluster IDs. Sure seems to make sense to do both...


# DGE w/ resolution = 2.5
Strt with comparing all clusters against all other clusters
Write out cluster info


calculate `FindAllMarkers()` for different idents and save to new file
```{r}
ident.list <- c("RNA_snn_res.0.5", "RNA_snn_res.1", "RNA_snn_res.1.5", "RNA_snn_res.2", "RNA_snn_res.2.5", "clust.ID")
for(tested.ident in ident.list){
	Idents(cmp.object) <- tested.ident
	all.markers <- FindAllMarkers(cmp.object)
	xlsx::write.xlsx(x = all.markers[,c("avg_log2FC", "p_val_adj", "cluster", "gene")], 
									 file = paste0(projectName, "_FindALLMarkers_res2.5.xlsx"), 
									 sheetName = tested.ident, 
									 col.names = TRUE, 
									 row.names = FALSE, 
									 append = TRUE)
}
```

Create `FindAllMarkers()` lists for GSEA
```{r}
Idents(cmp.object) <- "RNA_snn_res.2.5"
res.2.5.allmarkers <- FindAllMarkers(cmp.object)
```

Map HGNC symbols
```{r}
Mouse2HumanTable <- Mouse2Human(res.2.5.allmarkers$gene)

HGNC <- with(Mouse2HumanTable, Mouse2HumanTable$HGNC[match(res.2.5.allmarkers$gene, Mouse2HumanTable$MGI)])
head(res.2.5.allmarkers)
res.2.5.allmarkers$HGNC <- HGNC
tail(res.2.5.allmarkers)
sig.res.2.5 <- res.2.5.allmarkers[res.2.5.allmarkers$p_val_adj <= 0.05, ]
sig.res.2.5 <- sig.res.2.5[c("avg_log2FC", "HGNC", "cluster")]
sig.res.2.5 <- sig.res.2.5[!(sig.res.2.5$HGNC == "" | is.na(sig.res.2.5$HGNC)),] # GSEA will fail if there are any blanks or NAs in the table
sig.res.2.5 <- sig.res.2.5[]

```


```{r}
for(cluster in unique(sig.res.2.5$cluster)){
	print(paste("writing cluster", cluster))
	new.table <- sig.res.2.5[sig.res.2.5$cluster == cluster, c("HGNC", "avg_log2FC")]
	new.table <- new.table[order(-new.table$avg_log2FC), ]
	write.table(new.table, file = paste0("RankList_res2.5_findAll_hgnc/res.2.5cluster", cluster, ".rnk"), quote = FALSE, row.names = FALSE, col.names = TRUE, sep = "\t", )
	
}
```



calculate `FindMarkers()` that distinguish each cluster (might overlab between clusters)
```{r}
ident.list <- c("RNA_snn_res.0.5", "RNA_snn_res.1", "RNA_snn_res.1.5", "RNA_snn_res.2", "RNA_snn_res.2.5", "clust.ID")
for(tested.ident in ident.list){
	for (cluster in sort(as.numeric(levels(cmp.object@meta.data[[tested.ident]])))){
	cluster.markers <- FindMarkers(cmp.object, ident.1 = cluster)
	xlsx::write.xlsx(x = cluster.markers[,c("avg_log2FC", "p_val_adj")], 
									 file = paste0(projectName, "_FindMarkers_", gsub("RNA_snn_", "", tested.ident), ".xlsx"), 
									 sheetName = paste0("clst", cluster), 
									 col.names = TRUE, 
									 row.names = TRUE, 
									 append = TRUE)
}
}
```



```{r}
for (cluster in sort(as.numeric(levels(cmp.object@meta.data$RNA_snn_res.2.5)))){
	cluster.markers <- FindMarkers(cmp.object, ident.1 = cluster)
	xlsx::write.xlsx(x = cluster.markers[,c("avg_log2FC", "p_val_adj")], 
									 file = paste0(projectName, "_FindMarkers_res2.5.xlsx"), 
									 sheetName = paste0("clst", cluster), 
									 col.names = TRUE, 
									 row.names = TRUE, 
									 append = TRUE)
}
```

## Combine clusters that might represent old cluster ids

# DGE w/ metadata against clust.ID against "0"
reset ident as "clust.ID" and rerun `FindAllMarkers()`
```{r}
	Idents(cmp.object) <- "clust.ID"
	all.markers <- FindAllMarkers(cmp.object)
	xlsx::write.xlsx(x = all.markers[,c("avg_log2FC", "p_val_adj", "cluster", "gene")], 
									 file = paste0(projectName, "_FindALLMarkers_clustID.xlsx"), 
									 sheetName = "clustID", 
									 col.names = TRUE, 
									 row.names = FALSE, 
									 append = TRUE)
```


```{r}
# Idents(cmp.object) <- "clust.ID"
for (cluster in unique(cmp.object@meta.data$clust.ID)){
	print(cluster)
	cluster.markers <- FindMarkers(cmp.object, ident.1 = cluster)
	xlsx::write.xlsx(x = cluster.markers[,c("avg_log2FC", "p_val_adj")], 
									 file = paste0(projectName, "_FindMarkers_clustID.xlsx"), 
									 sheetName = paste0("oldclust", cluster), 
									 col.names = TRUE, 
									 row.names = TRUE, 
									 append = TRUE)
}

```


# Distinguishing features of clusters
Previously defined biomark genes based on PC contributions. Original list was based on *all* msAggr, but let's see how CMP subset does?
```{r fig.height = 30, fig.width=6}
VizDimLoadings(cmp.object, dims = 1:10, nfeatures = 30, reduction = "pca", ncol = 2)
```

```{r}
pca.df <- cmp.object[["pca"]]
pca.df <- as.data.frame(as.matrix(slot(object = pca.df, name = "feature.loadings")))
print(cmp.object[["pca"]], dims = 2, nfeatures = 5)
rownames(pca.df[pca.df$PC_2 %in% sort(pca.df$PC_2, decreasing = TRUE)[1:5], ])
rownames(pca.df[pca.df$PC_2 %in% sort(pca.df$PC_2)[1:5], ])
```

now we can get a list of principal components!  
first pull the list of oldAnalysis CMP top PC genes
```{r}
cmp.biomark <- read.table(file = "/Users/heustonef/Desktop/CMPSubpops/BioMark/ProbePanels/CMP_PCTopGenes.txt", sep = "\t", header = TRUE)
biomark.cmptargets <- c()
for(df.col in 1:ncol(cmp.biomark)){
	biomark.cmptargets <- c(biomark.cmptargets, biomark[,df.col])
}
print(colnames(biomark))
print(paste("total gene count:", length(biomark.cmptargets)))
```

Now get the list of current pc gene trgets (oldAnalysis used ndim = 1:6, so we'll start with that range)
```{r}
pc.list <- c("PC_1", "PC_2", "PC_3", "PC_4", "PC_5", "PC_6")
pc.genes <- lapply(pc.list, function(x) rownames(pca.df[pca.df[[x]] %in% sort(pca.df[[x]], decreasing = TRUE)[1:30],])) #targeting roughly 180 genes like in biomark.cmptargets
pc.genes <- unique(unlist(pc.genes))
print(paste("total gene count:", length(pc.genes)))
```

Now compare the lists, I guess:

```{r}
# setdiff(x,y) gives you things in x not in y. setdiff(y,x) gives you things in y not in x
setdiff(biomark.cmptargets, pc.genes)
# print(paste("\n length:", length(setdiff(biomark.cmptargets, pc.genes))))
writeLines(c("", "length:", length(setdiff(biomark.cmptargets, pc.genes))))
```
Umm, yeah that went kinda how I expected. Let's do this again, but for the actual biomark gene lists.
```{r}
biomark <- read.table(file = "/Users/heustonef/Desktop/CMPSubpops/BioMark/ProbePanels/BiomarkProbeList.txt", sep = "\t")
biomark <- biomark[,1]
setdiff(biomark, pc.genes)
writeLines(c("", "length:", length(setdiff(biomark, pc.genes))))
```


What if we increase the number of pcs but decrease the depth of each? This might cover more of `biomark	`, which was originally developed using msAggr instead of only the CMP subset
```{r}
pc.list <- c("PC_1", "PC_2", "PC_3", "PC_4", "PC_5", "PC_6", "PC_7", "PC_8", "PC_9", "PC_10")
pc.genes <- lapply(pc.list, function(x) rownames(pca.df[pca.df[[x]] %in% sort(pca.df[[x]], decreasing = TRUE)[1:20],]))
pc.genes <- unique(unlist(pc.genes))
print(paste("total gene count:", length(pc.genes)))

```
```{r}
setdiff(biomark, pc.genes)
writeLines(c("", "length:", length(setdiff(biomark, pc.genes))))
```

For comparison, let's just see how many of `biomark.cmptargets` were actually included in `biomark`
```{r}
setdiff(biomark.cmptargets, biomark)
writeLines(c("", "length:", length(setdiff(biomark.cmptargets, pc.genes))))
```
```{r}
length(biomark) - length(setdiff(biomark, biomark.cmptargets))
```
```{r}
length(biomark) - length(setdiff(biomark, pc.genes))
```
So when you look at it like that, it's not actually that far off.


What are the similarities?:
```{r}
setdiff(setdiff(biomark, biomark.cmptargets), setdiff(biomark, pc.genes))
```
These are genes from the 97probes not in the old CMP set that are also not in the new CMP set. Other than Itga2b (which is a failed probe anyway), nothing screams. Also we'd have thrown Flt3 and Cd34 for in anyway because they're requisite cell surface markers (also Flt3 surface marker is expensive but otherwise not noteworthy and not used in the current sorting strategy)

What about cell surface marker expression?
* Cd34
* Cd16/32
* Cd9
* Cd41
* Cd48
* Sca1 (just throw that in for sh*&s and giggles)
```{r, fig.height = 15, fig.width=10}
surface.markers <- c("Cd34", "Fcgr3", "Fcgr2b", "Cd9", "Itga2b", "Cd48", "Ly6a")
FeaturePlot(cmp.object, features = surface.markers, pt.size = 1, split.by = "clust.ID", ncol = 1)
```
Save as png
```{r}
png(filename = "FeaturePlot_CMP_surfaceMarkers_clustIDfacet.png", height = 1600, width = 1600)
FeaturePlot(cmp.object, features = surface.markers, pt.size = 1, split.by = "clust.ID", ncol = 1)
dev.off()
```





# Cell cycle analysis
Just so we know what we're working with


```{r}
s.genes <- cc.genes.updated.2019$s.genes
g2m.genes <- cc.genes.updated.2019$g2m.genes
```


# Compare @ hierarchcial clusteirng

Do clustering using biomark RNAs as input
```{r}
# Read in BiomarkRNAs
biomark.rnas <- read.table('/Users/heustonef/Desktop/10XGenomicsData/BiomarkRNAs.txt')
biomark.rnas <- biomark.rnas$V1
```

use biomark RNAs to define dimensional reduction
```{r}
cmp.object <- readRDS("CMP_raw.RDS")
cmp.object <- RunPCA(cmp.object, features = biomark.rnas, ndims.print = 1:5, , nfeatures.print = 5)
ElbowPlot(cmp.object, ndims = 50)
```


Now run the clustering
```{r}
tot.var <- percent.variance(cmp.object@reductions$pca@stdev, plot.var = FALSE, return.val = TRUE)
ndims <- length(which(cumsum(tot.var) <= 90))

cmp.object <- FindNeighbors(cmp.object, dims = 1:ndims)
cmp.object <- FindClusters(cmp.object, resolution = 0.5)
cmp.object <- RunUMAP(cmp.object, dims = 1: ndims)


```

find the clusters

```{r}
for(x in c(0.5, 1, 1.5, 2, 2.5)){
	cmp.object <- FindClusters(cmp.object, resolution = x)
}
```

Plot the umaps and cell cluster ids
```{r}
for (meta.col in colnames(cmp.object@meta.data)){
	if(grepl(pattern = ("RNA_snn_res"), x = meta.col)==TRUE){
		myplot <- DimPlot(cmp.object, 
											group.by = meta.col,
											reduction = "umap", 
											cols = color.palette
											) + 
			ggtitle(paste0(projectName, " dim", ndims, "res", gsub("RNA_snn_res", "", meta.col) ))
		plot(myplot)
	}
}
```
### Calculate anticipated number of cells you'll find in each biomark cluster
Get # cells in each cluster

```{r}
tot.cellcount <- nrow(cmp.object@meta.data)
res05.list <- sort(unique(cmp.object@meta.data$RNA_snn_res.0.5), decreasing = FALSE)
sapply(res05.list, 
			 function(x){
			 	print(
			 		paste(
			 			"cluster", x, "=", 
			 			nrow(cmp.object@meta.data[cmp.object@meta.data$RNA_snn_res.0.5 == x,]), 
			 			"cells or", 
			 			round(nrow(cmp.object@meta.data[cmp.object@meta.data$RNA_snn_res.0.5 == x,])/tot.cellcount*100, digits = 2), 
			 			"% of total"
			 		)
			 	)
			 }
			)
```

So we did the dimensional reduction based on the biomark RNAs, then did our UMAP nearest neighbor clustering.


In the biomark hierarchcial clustering analysis I assayed 167 cells. The smallest cluster I detected had 3 cells, or 1.8% of total, and this is an uncomfortably small number of cells. Based on the UMAP calculations I would therefore expect to find 11 or 12 of the predicted 15 clusters. I found 12, and I don't really like that last one, so 11 or 12. Since I did the hierarchcial clustering yesterday and did this math today, we can say it was independent of these results and therefore totally legit. Yay!!

